# Get the file url
fileurl <- "http://ghuang.stat.nctu.edu.tw/course/statmethods16/files/data/samplexprs.csv"
# Import the csv file into R
# note: stringsAsFactors=TRUE will screw up conversion to numeric!
samplexprs <- read.csv(fileurl, header=TRUE, stringsAsFactors=FALSE)
dim(samplexprs)
## [1] 78 4746
head(samplexprs[,1:10])
## id age metastases followup ERp J00129 Contig29982_RC Contig42854
## 1 FG80 52 0 7.35 100 -0.795 -0.387 0.199
## 2 SF58 50 1 1.15 0 -0.509 0.459 -0.257
## 3 DE72 54 0 12.12 100 -0.961 -0.631 0.037
## 4 DE65 40 0 6.25 0 -0.749 0.699 -0.346
## 5 HG87 53 0 5.18 0 -0.426 -0.406 -0.355
## 6 HG88 37 1 1.09 100 -0.566 -0.596 -0.352
## Contig42014_RC Contig27915_RC
## 1 -0.247 0.176
## 2 -0.065 0.129
## 3 -0.153 0.144
## 4 0.032 0.300
## 5 0.429 -0.036
## 6 -0.336 0.037
stem(samplexprs$age)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 2 | 88
## 3 | 00234
## 3 | 677788889999
## 4 | 00111111112333444
## 4 | 55555666678888888999
## 5 | 0012222222333444444444
stem(samplexprs$J0012)
##
## The decimal point is at the |
##
## -2 | 0
## -1 |
## -1 | 321000
## -0 | 9999998888888888777777776666666666666665555555555555
## -0 | 4444444433221
## 0 | 01
## 0 | 689
## 1 | 2
For q-q plot, we can first compute the percentiles for our list of data (ages/J0012) and for the normal distribution, and then compare them.
library(rafalib)
# plot for age
x <- samplexprs$age
ps <- ( seq(0,99) + 0.5 )/100
qs <- quantile(x, ps)
normalqs <- qnorm(ps, mean(x), popsd(x))
plot(normalqs,qs,xlab="Normal percentiles",ylab="Age percentiles")
abline(0,1, col="red") ##identity line
# plot for J0012
y <- samplexprs$J0012
ps <- ( seq(0,99) + 0.5 )/100
qs <- quantile(y, ps)
normalqs <- qnorm(ps, mean(y), popsd(y))
plot(normalqs,qs,xlab="Normal percentiles",ylab="J0012 percentiles")
abline(0,1, col="red") ##identity line
We can also see the plot with the existing function qqnorm.
qqnorm(x); qqline(x, col="red")
qqnorm(y); qqline(y, col="red")
Notice that the qqnorm function plots against a standard normal distribution. This is why the line has slope popsd(x) (or popsd(y)) and intercept mean(x) (or mean(y)).
In fact, we can run Monte Carlo simulations to see plots like this for data known to be normally distributed.
n <-1000
x <- rnorm(n)
qqnorm(x)
qqline(x, col="red")
We can also get a sense for how non-normally distributed data will look in a qq-plot. Here we generate data from the t-distribution with different degrees of freedom. Notice that the smaller the degrees of freedom, the fatter the tails. We call these “fat tails” because if we plotted an empirical density or histogram, the density at the extremes would be higher than the theoretical curve. In the qq-plot, this can be seen in that the curve is lower than the identity line on the left side and higher on the right side. This means that there are more extreme values than predicted by the theoretical density plotted on the x-axis.
dfs <- c(3,6,12,30)
mypar(2,2)
for(df in dfs){
x <- rt(1000,df)
qqnorm(x,ylab="t quantiles",main=paste0("d.f.=",df),ylim=c(-6,6))
qqline(x, col="red")
}
# Install package "ggplot2" first, then load it into R
library(ggplot2)
# with qplot
qplot(J00129, data=samplexprs, geom="histogram", binwidth=0.1)
qplot(J00129, data=samplexprs, geom="histogram", binwidth=0.05)
# with ggplot
h <- ggplot(samplexprs, aes(x=J00129))
h + geom_histogram(binwidth=0.05)
# display a smooth density estimate
h + geom_histogram(aes(y = ..density..), binwidth=0.05) + geom_density()
qplot(x=factor(1), y=J00129, data=samplexprs, geom="boxplot", xlab="")
b <- ggplot(samplexprs, aes(x=factor(1), y=J00129))
b + geom_boxplot() + labs(x="")
b + geom_boxplot() + labs(x="") + coord_flip()
b + geom_boxplot(fill="grey80", colour="#3366FF") + labs(x="") + coord_flip()
ba <- ggplot(samplexprs, aes(x=factor(ERp)))
ba + geom_bar()
ba + geom_bar(fill="white", colour="darkgreen")
qplot(factor(ERp), data=samplexprs, geom="bar", fill=factor(ERp))
p <- ggplot(samplexprs, aes(x=factor(1), fill=factor(ERp)))
p + geom_bar(width=1, position="fill") + coord_polar(theta="y")
s <- ggplot(samplexprs, aes(J00129, Contig29982_RC))
s + geom_point()
# Add trend line
s + geom_point() + stat_smooth()
## `geom_smooth()` using method = 'loess'
# Get the file url
fileurl <- "http://ghuang.stat.nctu.edu.tw/course/statmethods16/files/data/exprs_sig.csv"
# Import the csv file into R
# note: stringsAsFactors=TRUE will screw up conversion to numeric!
exprs_sig <- read.csv(fileurl, header=TRUE, stringsAsFactors=FALSE)
# note: read.csv treats the 1st column of exprs_sig.csv as the data, not the rownames,
# we have to correct this
# get the 1st column as the rownames
names1 <- exprs_sig[,1]
# remove the 1st column from exprs_sig
exprs_sig <- exprs_sig[,-1]
# assign the rownames
rownames(exprs_sig) <- names1
colnames(exprs_sig)
## [1] "FG80" "SF58" "DE72" "DE65" "HG87" "HG88" "AB22" "HG91" "HG92" "KH11"
## [11] "KH20" "SF67" "LD44" "AA04" "AA01" "GL73" "AA10" "HG86" "DE62" "AB26"
## [21] "SF57" "DE61" "DD41" "AB31" "LD30" "SF66" "GL75" "AA05" "LD37" "FG78"
## [31] "DD42" "DD44" "KH99" "SF60" "DE66" "FG79" "FG83" "DD50" "DD46" "LD56"
## [41] "AA15" "AA02" "FG77" "HG93" "GL77" "HG90" "DD40" "GL71" "DD49" "LD33"
## [51] "KH15" "FG81" "DE71" "FG82" "DE73" "HG85" "LD45" "KH22" "LD35" "AA16"
## [61] "AA12" "DE63" "LD39" "DE70" "AB32" "AB20" "HG89" "SF69" "LD27" "AB24"
## [71] "AA03" "GL84" "DD43" "DD52" "FG75" "GL72" "KH14" "KH17"
dim(exprs_sig)
## [1] 4741 78
head(exprs_sig[,1:10])
## FG80 SF58 DE72 DE65 HG87 HG88 AB22 HG91
## J00129 -0.795 -0.509 -0.961 -0.749 -0.426 -0.566 -0.420 -0.499
## Contig29982_RC -0.387 0.459 -0.631 0.699 -0.406 -0.596 -0.286 -0.402
## Contig42854 0.199 -0.257 0.037 -0.346 -0.355 -0.352 -0.090 0.181
## Contig42014_RC -0.247 -0.065 -0.153 0.032 0.429 -0.336 -0.048 0.143
## Contig27915_RC 0.176 0.129 0.144 0.300 -0.036 0.037 0.291 -0.268
## Contig20156_RC -0.129 0.009 -0.202 -0.025 0.191 -0.147 -0.166 0.849
## HG92 KH11
## J00129 -0.465 -0.189
## Contig29982_RC -0.533 -0.309
## Contig42854 -0.019 -0.152
## Contig42014_RC 0.019 0.918
## Contig27915_RC -0.388 0.585
## Contig20156_RC -0.012 -0.144
sh <- ggplot(exprs_sig, aes(AA01, SF58))
sh + geom_point()
# Varying alpha is useful for large datasets
sh + geom_point(alpha = 0.3)
# Add trend line
sh + geom_point(alpha = 0.3) + stat_smooth()
## `geom_smooth()` using method = 'gam'
# Or adding heatmap of 2d bin
sh + geom_bin2d()
# With qplot
qplot(x=factor(metastases), y=J00129, data=samplexprs, geom="boxplot")
qplot(x=factor(ERp), y=J00129, data=samplexprs, geom="boxplot")
# With ggplot
b2 <- ggplot(samplexprs, aes(x=factor(metastases), y=J00129))
b2 + geom_boxplot()
b2 <- ggplot(samplexprs, aes(x=factor(ERp), y=J00129))
b2 + geom_boxplot()
sba <- ggplot(samplexprs, aes(x=factor(ERp), fill=factor(metastases)))
# y is count
sba + geom_bar()
sba + geom_bar() + coord_flip()
# y is proportion
sba + geom_bar(position="fill") + labs(y="proportion")
fba <- ggplot(samplexprs, aes(x=factor(ERp)))
fba + geom_bar() + facet_wrap(~metastases)
# For ERp+metastases
# Create 2x2 table for ERp vs. metastases and obtain counts
a<-xtabs(~ERp+metastases, data=samplexprs)
b<-data.frame(b1=c(a), b2=rep(as.numeric(colnames(a)), rep(dim(a)[1],dim(a)[2])), b3=rep(as.numeric(rownames(a)),dim(a)[2]))
# Stacked area chart
sa <- ggplot(b, aes(x=b3, y=b1, fill=factor(b2)))
sa + geom_area() + labs(x="ERp", y="count", fill="metastases")
sa + geom_area(position="fill") + labs(x="ERp", y="proportion", fill="metastases")
# For age+metastases
# Create 2x2 table for age vs. metastases and obtain counts
a<-xtabs(~age+metastases, data=samplexprs)
b<-data.frame(b1=c(a), b2=rep(as.numeric(colnames(a)), rep(dim(a)[1],dim(a)[2])), b3=rep(as.numeric(rownames(a)),dim(a)[2]))
# Stacked area chart
sa <- ggplot(b, aes(x=b3, y=b1, fill=factor(b2)))
sa + geom_area() + labs(x="age", y="count", fill="metastases")
sa + geom_area(position="fill") + labs(x="age", y="proportion", fill="metastases")
# Connect observations, ordered by x value
t <- ggplot(samplexprs, aes(x=age, y=J00129, group=metastases))
t + geom_line(aes(colour=factor(metastases)), size=1)
# Install the package "lattice", then load it into R
library(lattice)
# 3d scatterplot
cloud(age~J00129+Contig29982_RC, data=samplexprs)
cloud(AA01~SF58+LD37, data=exprs_sig)
# lattice in the 3rd dim
xyplot(Contig29982_RC~J00129 | factor(metastases), data=samplexprs)
xyplot(Contig29982_RC~J00129 | factor(ERp), data=samplexprs)
# map the 3rd dim to colors
d3c <- ggplot(samplexprs, aes(J00129, Contig29982_RC))
d3c + geom_point(aes(colour=age))
d3c + geom_point(aes(colour=factor(metastases)))
d3c + geom_point(aes(colour=factor(ERp)))
# lay out panels in the 3rd dim
d3c + geom_point() + facet_grid(.~metastases)
d3c + geom_point() + facet_grid(.~ERp)
pairs(samplexprs[,c('age','ERp','J00129','Contig29982_RC')])
pairs(samplexprs[,c('age','ERp','J00129','Contig29982_RC')], col=as.integer(samplexprs[,'metastases'])+1)
pairs(samplexprs[,c('age','ERp','J00129','Contig29982_RC')], col=as.integer(samplexprs[,'metastases'])+1,
panel=panel.smooth)
# Heatmap
a <- exprs_sig[sample(1:dim(exprs_sig)[1], size=70),]
# no dendrogram
heatmap(as.matrix(a), Rowv=NA, Colv=NA,
xlab="patients", ylab="genes",
cexRow=0.5, cexCol=0.6)
# with dendrogram
heatmap(as.matrix(a),
xlab="patients", ylab="genes",
cexRow=0.5, cexCol=0.6)
# with different color schemes
# install the package "RColorBrewer", then load it into R
library(RColorBrewer)
heatmap(as.matrix(a), col=brewer.pal(9, "BuGn"),
xlab="patients", ylab="genes",
cexRow=0.5, cexCol=0.6)
# use heatmap.2 and self-set dendrogram
# install packages "graphics", "gplots", "RColorBrewer"", then load them into R
library(graphics)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(RColorBrewer)
a <- exprs_sig[sample(1:dim(exprs_sig)[1], size=70),]
hcpat <- hclust(dist(t(a)), method="average")
dendpat <- as.dendrogram(hcpat)
hcgene <- hclust(as.dist(1-cor(t(a))), method="average")
dendgene <- as.dendrogram(hcgene)
# create a blue -> purple colour palette
jBuPuFun <- colorRampPalette(brewer.pal(n = 9, "BuPu"))
paletteSize <- 256
jBuPuPalette <- jBuPuFun(paletteSize)
heatmap.2(as.matrix(a), Rowv=dendgene, Colv=dendpat, col=jBuPuPalette,
xlab="patients", ylab="genes", scale="none", trace="none",
cexRow=0.5, cexCol=0.6, density.info="none")